CONTEXT:
The data concerns city-cycle fuel consumption in miles per gallon, to be predicted from three multivalued discrete and five continuous attributes.
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy import stats
from scipy.stats import zscore
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(color_codes=True)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
# Loading the JSON file and concatenating it with the car names (a row-count check is sketched after the preview below)
ca=pd.read_json(r'./Part1+-+Car-Attributes.json')
ca1=pd.read_csv('./Part1+-+Car+name.csv')
car=pd.concat([ca,ca1],axis=1)
car.head()
| | mpg | cyl | disp | hp | wt | acc | yr | origin | car_name |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
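Since the attributes and the car names come from two separate files and are concatenated column-wise, it is worth a quick sanity check that the two frames line up row-for-row; a small sketch:
# Sketch: confirm the two source files have the same number of rows before trusting the concat
print(ca.shape, ca1.shape)
assert len(ca) == len(ca1), 'row counts differ between the attribute and name files'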
row, column = car.shape
print('The dataset contains', row, 'rows and', column, 'columns')
The dataset contains 398 rows and 9 columns
# Save this data to CSV, XLSX and JSON
car.to_csv('mpg.csv', index=False)
car.to_excel('mpg.xlsx', index = False)
car.to_json('mpg.json', orient = 'split', compression = 'infer', index = True)
#dropping/ignoring car_name
car = car.drop('car_name', axis=1)
# Also replace the numeric origin codes with region labels
car['origin'] = car['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
car.head()
| | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | america |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | america |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | america |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | america |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | america |
print('The data type of each attribute: \n')
car.info()
The data type of each attribute:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   mpg     398 non-null    float64
 1   cyl     398 non-null    int64
 2   disp    398 non-null    float64
 3   hp      398 non-null    object
 4   wt      398 non-null    int64
 5   acc     398 non-null    float64
 6   yr      398 non-null    int64
 7   origin  398 non-null    object
dtypes: float64(3), int64(3), object(2)
memory usage: 25.0+ KB
#5-point summary
car.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| mpg | 398.0 | 23.514573 | 7.815984 | 9.0 | 17.500 | 23.0 | 29.000 | 46.6 |
| cyl | 398.0 | 5.454774 | 1.701004 | 3.0 | 4.000 | 4.0 | 8.000 | 8.0 |
| disp | 398.0 | 193.425879 | 104.269838 | 68.0 | 104.250 | 148.5 | 262.000 | 455.0 |
| wt | 398.0 | 2970.424623 | 846.841774 | 1613.0 | 2223.750 | 2803.5 | 3608.000 | 5140.0 |
| acc | 398.0 | 15.568090 | 2.757689 | 8.0 | 13.825 | 15.5 | 17.175 | 24.8 |
| yr | 398.0 | 76.010050 | 3.697627 | 70.0 | 73.000 | 76.0 | 79.000 | 82.0 |
# Check which 'hp' entries are not purely numeric strings (isdigit)
hpIsDigit = pd.DataFrame(car.hp.str.isdigit())
# Show the rows where isdigit is False
car[hpIsDigit['hp'] == False]
| | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 | america |
| 126 | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 | america |
| 330 | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 | europe |
| 336 | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 | america |
| 354 | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 | europe |
| 374 | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 | america |
# Replace missing values with NaN
car = car.replace('?', np.nan)
car[hpIsDigit['hp'] == False]
| | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | NaN | 2046 | 19.0 | 71 | america |
| 126 | 21.0 | 6 | 200.0 | NaN | 2875 | 17.0 | 74 | america |
| 330 | 40.9 | 4 | 85.0 | NaN | 1835 | 17.3 | 80 | europe |
| 336 | 23.6 | 4 | 140.0 | NaN | 2905 | 14.3 | 80 | america |
| 354 | 34.5 | 4 | 100.0 | NaN | 2320 | 15.8 | 81 | europe |
| 374 | 23.0 | 4 | 151.0 | NaN | 3035 | 20.5 | 82 | america |
# Replace the missing values with the median value
# (hp is still an object column of strings after replacing '?', so cast it to float first)
car['hp'] = car['hp'].astype(float)
car.median()
mpg       23.0
cyl        4.0
disp     148.5
hp        93.5
wt      2803.5
acc       15.5
yr        76.0
dtype: float64
#replace hp with median
car['hp'].fillna((car['hp'].median()), inplace=True)
print('The data set has no missing values \n')
car.isnull().sum()
The data set has no missing values
mpg       0
cyl       0
disp      0
hp        0
wt        0
acc       0
yr        0
origin    0
dtype: int64
# Create an mpg_level attribute based on the mpg quartiles (< 17 low, 17 to 29 medium, > 29 high); an equivalent pd.cut sketch follows the preview below
car['mpg_level'] = car['mpg'].apply(lambda x: 'low' if x<17 else 'high' if x>29 else 'medium')
car.head()
| | mpg | cyl | disp | hp | wt | acc | yr | origin | mpg_level |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | america | medium |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | america | low |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | america | medium |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | america | low |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | america | medium |
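The same binning can be expressed with pd.cut; this is only a sketch, assuming the cut points 17 and 29 used above (values exactly on a boundary are handled slightly differently than in the lambda version, and mpg_level_cut is a hypothetical name used just for comparison).
# Sketch: equivalent binning with pd.cut using right-open intervals
bins = [-np.inf, 17, 29, np.inf]
labels = ['low', 'medium', 'high']
mpg_level_cut = pd.cut(car['mpg'], bins=bins, labels=labels, right=False)
print(mpg_level_cut.value_counts())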
#categorical variable
car_cat = car.iloc[:,[1,6,7,8]]
car_cat.head()
| | cyl | yr | origin | mpg_level |
|---|---|---|---|---|
| 0 | 8 | 70 | america | medium |
| 1 | 8 | 70 | america | low |
| 2 | 8 | 70 | america | medium |
| 3 | 8 | 70 | america | low |
| 4 | 8 | 70 | america | medium |
#numeric variables
car_num=car.drop(['cyl','yr','origin','mpg_level'],axis=1)
car_num.head()
| | mpg | disp | hp | wt | acc |
|---|---|---|---|---|---|
| 0 | 18.0 | 307.0 | 130.0 | 3504 | 12.0 |
| 1 | 15.0 | 350.0 | 165.0 | 3693 | 11.5 |
| 2 | 18.0 | 318.0 | 150.0 | 3436 | 11.0 |
| 3 | 16.0 | 304.0 | 150.0 | 3433 | 12.0 |
| 4 | 17.0 | 302.0 | 140.0 | 3449 | 10.5 |
# Plotting categorical variables
# (use 'col' as the loop variable so the DataFrame 'car' is not overwritten)
fig = plt.figure(1, (14, 8))
for i, col in enumerate(car_cat.columns):
    ax = plt.subplot(2, 2, i + 1)
    sns.countplot(car_cat[col], order=car_cat[col].value_counts().index)
    ax.set_xlabel(None)
    ax.set_title(f'Distribution of {col}')
plt.tight_layout()
plt.show()
#plot histograms
car_num.hist(bins = 20, figsize = (10, 8), color = 'blue')
plt.show()
# Plot density
plt.figure(figsize=(17, 13))
col = 1
for i in car_num.columns:
    plt.subplot(3, 3, col)
    sns.distplot(car_num[i], color = 'b')
    col += 1
#joining the categorical and numerical variables
car=pd.concat([car_cat,car_num],axis=1)
#checking for attribute type
car.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   cyl        398 non-null    int64
 1   yr         398 non-null    int64
 2   origin     398 non-null    object
 3   mpg_level  398 non-null    object
 4   mpg        398 non-null    float64
 5   disp       398 non-null    float64
 6   hp         398 non-null    float64
 7   wt         398 non-null    int64
 8   acc        398 non-null    float64
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
# Create dummy variables for mpg_level and origin, and create separate datasets for K-means and hierarchical clustering
car = pd.get_dummies(car, columns=['origin'])
car = pd.get_dummies(car, columns=['mpg_level'])
car.head()
carH=car.copy()
carK=car.copy()
#pair plot for the numeric attributes
car_attr = car.iloc[:, 0:7]
sns.pairplot(car_attr, diag_kind='kde');
# Dropping the created dummy variables
car2=car.drop(['origin_america','origin_asia','origin_europe','mpg_level_high','mpg_level_low','mpg_level_medium'],axis=1)
# Checking for outliers
plt.figure(figsize=(25, 20))
col = 1
for i in car2.columns:
    plt.subplot(3, 3, col)
    sns.boxplot(car2[i], color='blue')
    col += 1
# Replacing outliers using the IQR rule: values beyond Q3 + 1.5*IQR (or below Q1 - 1.5*IQR) are capped
# (a reusable helper is sketched after the acc boxplot below)
IQR1 = stats.iqr(car2['hp'], interpolation = 'midpoint')
IQR2 = stats.iqr(car2['acc'], interpolation = 'midpoint')
# Horsepower after capping outliers
Q3 = car2['hp'].quantile(0.75)
car2['hp'] = np.where(car2["hp"] >(Q3+1.5*IQR1), 198.5,car2['hp'])
sns.boxplot(car2['hp']);
# Acceleration after capping outliers
Q1 = car2['acc'].quantile(0.25)
Q31=car2['acc'].quantile(0.75)
car2['acc'] = np.where(car2["acc"] >(Q31+1.5*IQR2),22.10 ,car2['acc'])
car2['acc'] = np.where(car2["acc"] <(Q1-1.5*IQR2),(Q1-1.5*IQR2),car2['acc'])
sns.boxplot(car2['acc']);
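The same capping logic can be wrapped in a small helper so it applies to any column without hard-coded cap values; a sketch under the same Q1/Q3 ± 1.5*IQR rule (the 198.5 and 22.10 used above are simply pre-computed caps, so this would reproduce roughly the same result).
# Sketch: cap values outside [Q1 - k*IQR, Q3 + k*IQR] at the fences
def cap_outliers_iqr(df, column, k=1.5):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    low, high = q1 - k * iqr, q3 + k * iqr
    df[column] = df[column].clip(lower=low, upper=high)
    return df
# Example usage on the same columns treated manually above:
# car2 = cap_outliers_iqr(car2, 'hp')
# car2 = cap_outliers_iqr(car2, 'acc')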
#checking for correlation
plt.figure(figsize=(10,8))
corr=car2.corr()
sns.heatmap(corr,annot=True);
Hierarchical Clustering
#separating numeric variables
cc = car.iloc[:,0:7]
cc.head()
| | cyl | yr | mpg | disp | hp | wt | acc |
|---|---|---|---|---|---|---|---|
| 0 | 8 | 70 | 18.0 | 307.0 | 130.0 | 3504 | 12.0 |
| 1 | 8 | 70 | 15.0 | 350.0 | 165.0 | 3693 | 11.5 |
| 2 | 8 | 70 | 18.0 | 318.0 | 150.0 | 3436 | 11.0 |
| 3 | 8 | 70 | 16.0 | 304.0 | 150.0 | 3433 | 12.0 |
| 4 | 8 | 70 | 17.0 | 302.0 | 140.0 | 3449 | 10.5 |
# Scaling the variables with z-scores
cc_z = cc.apply(zscore)
cc_z.head()
| | cyl | yr | mpg | disp | hp | wt | acc |
|---|---|---|---|---|---|---|---|
| 0 | 1.498191 | -1.627426 | -0.706439 | 1.090604 | 0.673118 | 0.630870 | -1.295498 |
| 1 | 1.498191 | -1.627426 | -1.090751 | 1.503514 | 1.589958 | 0.854333 | -1.477038 |
| 2 | 1.498191 | -1.627426 | -0.706439 | 1.196232 | 1.197027 | 0.550470 | -1.658577 |
| 3 | 1.498191 | -1.627426 | -0.962647 | 1.061796 | 1.197027 | 0.546923 | -1.295498 |
| 4 | 1.498191 | -1.627426 | -0.834543 | 1.042591 | 0.935072 | 0.565841 | -1.840117 |
# Hierarchical clustering on the pairwise distances using the average linkage method
link_method = linkage(cc_z.iloc[:,0:7], method = 'average')
# Plotting the dendrogram
plt.figure(figsize=(25, 10))
dendrogram(link_method)
plt.show()
This appears to be too much visual clutter, so we'll truncate the dendrogram to show the last 2 clusters/groups.
# Truncated dendrogram showing only the last 2 merged clusters
dendrogram(
link_method,
truncate_mode='lastp',
p=2,
)
plt.show()
# Viewing the clusters formed
clusters = fcluster(link_method, 2, criterion='maxclust')
clusters
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2,
2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2], dtype=int32)
# Attaching the clusters formed to the scaled data
cc_z['clusters_H'] = clusters
cc_z.head()
| | cyl | yr | mpg | disp | hp | wt | acc | clusters_H |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.498191 | -1.627426 | -0.706439 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | 1 |
| 1 | 1.498191 | -1.627426 | -1.090751 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | 1 |
| 2 | 1.498191 | -1.627426 | -0.706439 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | 1 |
| 3 | 1.498191 | -1.627426 | -0.962647 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | 1 |
| 4 | 1.498191 | -1.627426 | -0.834543 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | 1 |
# Viewing the distribution of clusters
cc_z.clusters_H.value_counts().sort_index()
1    100
2    298
Name: clusters_H, dtype: int64
#attaching the clusters formed to the original data
cc['clusters_H']=clusters
carH['clusters_H']=clusters
cc.head()
| | cyl | yr | mpg | disp | hp | wt | acc | clusters_H |
|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 70 | 18.0 | 307.0 | 130.0 | 3504 | 12.0 | 1 |
| 1 | 8 | 70 | 15.0 | 350.0 | 165.0 | 3693 | 11.5 | 1 |
| 2 | 8 | 70 | 18.0 | 318.0 | 150.0 | 3436 | 11.0 | 1 |
| 3 | 8 | 70 | 16.0 | 304.0 | 150.0 | 3433 | 12.0 | 1 |
| 4 | 8 | 70 | 17.0 | 302.0 | 140.0 | 3449 | 10.5 | 1 |
#create a new data set named Hclus
Hclus=cc
Hclus.head()
| | cyl | yr | mpg | disp | hp | wt | acc | clusters_H |
|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 70 | 18.0 | 307.0 | 130.0 | 3504 | 12.0 | 1 |
| 1 | 8 | 70 | 15.0 | 350.0 | 165.0 | 3693 | 11.5 | 1 |
| 2 | 8 | 70 | 18.0 | 318.0 | 150.0 | 3436 | 11.0 | 1 |
| 3 | 8 | 70 | 16.0 | 304.0 | 150.0 | 3433 | 12.0 | 1 |
| 4 | 8 | 70 | 17.0 | 302.0 | 140.0 | 3449 | 10.5 | 1 |
# Aggregating the numeric variables by cluster using the mean
aggdata=cc.iloc[:,0:8].groupby('clusters_H').mean()
aggdata['Freq']=cc.clusters_H.value_counts().sort_index()
aggdata
| clusters_H | cyl | yr | mpg | disp | hp | wt | acc | Freq |
|---|---|---|---|---|---|---|---|---|
| 1 | 7.980000 | 73.740000 | 14.684000 | 345.470000 | 160.400000 | 4121.560000 | 12.702000 | 100 |
| 2 | 4.607383 | 76.771812 | 26.477852 | 142.404362 | 85.479866 | 2584.137584 | 16.529866 | 298 |
#plotting the clusters formed
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="clusters_H",
data=cc_z,
palette=['green','brown']);
K-Means Clustering
# Separating the numeric values
cc = car.iloc[:,0:7]
cc_z1 = cc.apply(zscore)
cc_z1.head()
| | cyl | yr | mpg | disp | hp | wt | acc |
|---|---|---|---|---|---|---|---|
| 0 | 1.498191 | -1.627426 | -0.706439 | 1.090604 | 0.673118 | 0.630870 | -1.295498 |
| 1 | 1.498191 | -1.627426 | -1.090751 | 1.503514 | 1.589958 | 0.854333 | -1.477038 |
| 2 | 1.498191 | -1.627426 | -0.706439 | 1.196232 | 1.197027 | 0.550470 | -1.658577 |
| 3 | 1.498191 | -1.627426 | -0.962647 | 1.061796 | 1.197027 | 0.546923 | -1.295498 |
| 4 | 1.498191 | -1.627426 | -0.834543 | 1.042591 | 0.935072 | 0.565841 | -1.840117 |
# Calculating the within-cluster sum of squares (WSS)
wss = []
for i in range(1, 5):
    KM = KMeans(n_clusters=i)
    KM.fit(cc_z1)
    wss.append(KM.inertia_)
wss
[2785.9999999999995, 1294.841895072732, 946.0197908553794, 738.3743876111234]
# Plotting WSS against the number of clusters to find the optimal k (elbow method)
plt.plot(range(1,5), wss);
plt.title('Elbow Method');
plt.xlabel("Number of Clusters")
plt.ylabel("WSS");
#using 2 centroids for clustering
k_means = KMeans(n_clusters = 2)
k_means.fit(cc_z1)
labels = k_means.labels_
# Calculating silhouette_score
silhouette_score(cc_z1,labels)
0.48235946103916116
# Calculating the silhouette score for different numbers of clusters
kmeans_kwargs = {
"init": "random",
"n_init": 10,
"max_iter": 300,
"random_state": 42,
}
silhouette_coefficients = []
# Note: the silhouette coefficient is only defined for 2 or more clusters
for k in range(2, 7):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(cc_z1)
    score = silhouette_score(cc_z1, kmeans.labels_)
    silhouette_coefficients.append(score)
#plotting silhouette score for different centroids
plt.plot(range(2, 7), silhouette_coefficients)
plt.xticks(range(2, 7))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
#attaching the labels to the datasets
cc["cluster_K"] = labels
carK['cluster_K']=labels
Kclus=cc
Kclus.head()
| | cyl | yr | mpg | disp | hp | wt | acc | cluster_K |
|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 70 | 18.0 | 307.0 | 130.0 | 3504 | 12.0 | 1 |
| 1 | 8 | 70 | 15.0 | 350.0 | 165.0 | 3693 | 11.5 | 1 |
| 2 | 8 | 70 | 18.0 | 318.0 | 150.0 | 3436 | 11.0 | 1 |
| 3 | 8 | 70 | 16.0 | 304.0 | 150.0 | 3433 | 12.0 | 1 |
| 4 | 8 | 70 | 17.0 | 302.0 | 140.0 | 3449 | 10.5 | 1 |
# Viewing the distribution of clusters
cc.cluster_K.value_counts().sort_index()
0    293
1    105
Name: cluster_K, dtype: int64
# Attaching the clusters to the scaled data
cc_z1["cluster_K"] = labels
cc_z1.head()
| | cyl | yr | mpg | disp | hp | wt | acc | cluster_K |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.498191 | -1.627426 | -0.706439 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | 1 |
| 1 | 1.498191 | -1.627426 | -1.090751 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | 1 |
| 2 | 1.498191 | -1.627426 | -0.706439 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | 1 |
| 3 | 1.498191 | -1.627426 | -0.962647 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | 1 |
| 4 | 1.498191 | -1.627426 | -0.834543 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | 1 |
# Aggregating the numeric variables by cluster using the mean
aggdata=cc.iloc[:,0:8].groupby('cluster_K').mean()
aggdata['Freq']=cc.cluster_K.value_counts().sort_index()
aggdata
| cluster_K | cyl | yr | mpg | disp | hp | wt | acc | Freq |
|---|---|---|---|---|---|---|---|---|
| 0 | 4.569966 | 76.822526 | 26.619113 | 140.250853 | 85.061433 | 2567.860068 | 16.535836 | 293 |
| 1 | 7.923810 | 73.742857 | 14.851429 | 341.809524 | 158.000000 | 4093.771429 | 12.867619 | 105 |
This clearly shows two distinct groups, with a marked difference between the cluster means across all variables.
#plotting the clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="cluster_K",
data=cc_z1,
palette=['green','brown']);
carH.clusters_H.value_counts().sort_index()
1    100
2    298
Name: clusters_H, dtype: int64
carK.cluster_K.value_counts().sort_index()
0    293
1    105
Name: cluster_K, dtype: int64
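The two clusterings produce very similar group sizes (100/298 vs 105/293); a quick crosstab makes the agreement explicit (a sketch, noting that the label numbering differs between the two methods, so only the cross-counts matter):
# Sketch: compare hierarchical and K-means assignments side by side
pd.crosstab(carH['clusters_H'], carK['cluster_K'],
            rownames=['clusters_H'], colnames=['cluster_K'])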
carH.shape
(398, 14)
carK.shape
(398, 14)
car.head()
| | cyl | yr | mpg | disp | hp | wt | acc | origin_america | origin_asia | origin_europe | mpg_level_high | mpg_level_low | mpg_level_medium |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 70 | 18.0 | 307.0 | 130.0 | 3504 | 12.0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1 | 8 | 70 | 15.0 | 350.0 | 165.0 | 3693 | 11.5 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 8 | 70 | 18.0 | 318.0 | 150.0 | 3436 | 11.0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 3 | 8 | 70 | 16.0 | 304.0 | 150.0 | 3433 | 12.0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 4 | 8 | 70 | 17.0 | 302.0 | 140.0 | 3449 | 10.5 | 1 | 0 | 0 | 0 | 0 | 1 |
Linear regression on the original dataset
X = car.drop(['mpg','origin_europe','mpg_level_low'], axis=1)
# the dependent variable
y = car[['mpg']]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
LinearRegression()
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for cyl is -0.5134441386218828
The coefficient for yr is 0.4434650429116842
The coefficient for disp is 0.010688858394646887
The coefficient for hp is 0.010315514536314008
The coefficient for wt is -0.004538788568737129
The coefficient for acc is 0.19183425608862537
The coefficient for origin_america is -1.7306209513688993
The coefficient for origin_asia is -0.8976724344009405
The coefficient for mpg_level_high is 8.552374663817027
The coefficient for mpg_level_medium is 1.5941218694850492
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is -1.6635717568652169
regression_model.score(X_train, y_train)
0.8967703023839786
O=regression_model.score(X_test, y_test)
O
0.9037421476349174
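R² alone says little about error size in mpg units, so it helps to look at an error metric as well; a sketch computing the test RMSE for the same fitted model:
# Sketch: test-set RMSE (in mpg) for the fitted regression model
from sklearn.metrics import mean_squared_error
y_pred = regression_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Test RMSE: {:.3f} mpg'.format(rmse))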
Linear regression on data with K-means clusters
# Renaming the cluster labels to light and heavy vehicles and creating dummy variables from them
carK['cluster_K']=carK['cluster_K'].astype('category')
carK['cluster_K'] = carK['cluster_K'].replace({1: 'heavy', 0: 'light'})
carK = pd.get_dummies(carK, columns=['cluster_K'])
carK.head()
| | cyl | yr | mpg | disp | hp | wt | acc | origin_america | origin_asia | origin_europe | mpg_level_high | mpg_level_low | mpg_level_medium | cluster_K_light | cluster_K_heavy |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 70 | 18.0 | 307.0 | 130.0 | 3504 | 12.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 1 | 8 | 70 | 15.0 | 350.0 | 165.0 | 3693 | 11.5 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 2 | 8 | 70 | 18.0 | 318.0 | 150.0 | 3436 | 11.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 3 | 8 | 70 | 16.0 | 304.0 | 150.0 | 3433 | 12.0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 4 | 8 | 70 | 17.0 | 302.0 | 140.0 | 3449 | 10.5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
X = carK.drop(['mpg','origin_europe','mpg_level_low','cluster_K_light'], axis=1)
# the dependent variable
y = carK[['mpg']]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=12)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
LinearRegression()
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for cyl is -1.1945995644777805
The coefficient for yr is 0.43186510415059826
The coefficient for disp is 0.017477496279110098
The coefficient for hp is -0.010138045835905891
The coefficient for wt is -0.0040684301693864056
The coefficient for acc is 0.1856482874624993
The coefficient for origin_america is -1.6918315494304086
The coefficient for origin_asia is -0.7407779192303001
The coefficient for mpg_level_high is 9.28312093915688
The coefficient for mpg_level_medium is 2.25000171423125
The coefficient for cluster_K_heavy is 2.511514014338475
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is 1.2041468067173469
regression_model.score(X_train, y_train)
0.8942370456543635
K=regression_model.score(X_test, y_test)
K
0.9117893808052382
Linear regression on data with H-clusters
# Renaming the cluster labels to light and heavy vehicles and creating dummy variables from them
carH['clusters_H']=carH['clusters_H'].astype('category')
carH['clusters_H'] = carH['clusters_H'].replace({1: 'heavy', 2: 'light'})
carH = pd.get_dummies(carH, columns=['clusters_H'])
X = carH.drop(['mpg','origin_europe','mpg_level_low','clusters_H_light'], axis=1)
# the dependent variable
y = carH[['mpg']]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
LinearRegression()
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for cyl is -1.0104832432576671
The coefficient for yr is 0.4475417357550161
The coefficient for disp is 0.015115200524614403
The coefficient for hp is -0.013301584387234493
The coefficient for wt is -0.00426417978067245
The coefficient for acc is 0.11805139164484575
The coefficient for origin_america is -2.1174569315391154
The coefficient for origin_asia is -1.3974915348558108
The coefficient for mpg_level_high is 8.565948239298274
The coefficient for mpg_level_medium is 1.6577250698582813
The coefficient for clusters_H_heavy is 2.038974468807404
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is 2.572729318233023
regression_model.score(X_train, y_train)
0.8988409890950728
H=regression_model.score(X_test, y_test)
H
0.9010238373846703
modellists = []
modellists.append(['Linear Regression on Original Data set', O*100])
modellists.append(['Linear Regression with K means clusters', K*100])
modellists.append(['Linear Regression with Hierarchical clusters', H*100])
mdl_df = pd.DataFrame(modellists, columns = ['Model','r^2 on Test'])
mdl_df
| | Model | r^2 on Test |
|---|---|---|
| 0 | Linear Regression on Original Data set | 90.374215 |
| 1 | Linear Regression with K means clusters | 91.178938 |
| 2 | Linear Regression with Hierarchical clusters | 90.102384 |
Summary:
K-means appears to explain the highest variation in the dataset, but the difference is only about 1% compared with the other models. A larger dataset would give more clarity. Since this is a dataset of used cars, it does not capture how many previous owners each car has had, the profile of those owners, or the purpose the cars were used for, all of which might be helpful variables. With those additional features it may be possible to achieve higher accuracy and better explainability of the models and their variables.
CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn import metrics
#import the dataset
ve = pd.read_csv('./vehicle.csv')
ve.head()
| | compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
#checking for the dimension of the data
rows, column = ve.shape
print('The dataset contains', rows, 'rows and', column, 'columns.')
The dataset contains 846 rows and 19 columns.
#checking for the data type
print('The data type of each attribute: \n')
ve.info()
The data type of each attribute:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   compactness                  846 non-null    int64
 1   circularity                  841 non-null    float64
 2   distance_circularity         842 non-null    float64
 3   radius_ratio                 840 non-null    float64
 4   pr.axis_aspect_ratio         844 non-null    float64
 5   max.length_aspect_ratio      846 non-null    int64
 6   scatter_ratio                845 non-null    float64
 7   elongatedness                845 non-null    float64
 8   pr.axis_rectangularity       843 non-null    float64
 9   max.length_rectangularity    846 non-null    int64
 10  scaled_variance              843 non-null    float64
 11  scaled_variance.1            844 non-null    float64
 12  scaled_radius_of_gyration    844 non-null    float64
 13  scaled_radius_of_gyration.1  842 non-null    float64
 14  skewness_about               840 non-null    float64
 15  skewness_about.1             845 non-null    float64
 16  skewness_about.2             845 non-null    float64
 17  hollows_ratio                846 non-null    int64
 18  class                        846 non-null    object
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
print('Duplicated rows: ', ve[ve.duplicated()].shape[0])
Duplicated rows: 0
print('Null values:\n', ve.isnull().sum())
Null values:
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
# Replacing the missing values with median values
for cols in ve.columns:
    if cols != 'class':
        ve[cols] = ve[cols].fillna(ve[cols].median())
#5-point summary
ve.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | 93.678487 | 8.234474 | 73.0 | 87.00 | 93.0 | 100.00 | 119.0 |
| circularity | 846.0 | 44.823877 | 6.134272 | 33.0 | 40.00 | 44.0 | 49.00 | 59.0 |
| distance_circularity | 846.0 | 82.100473 | 15.741569 | 40.0 | 70.00 | 80.0 | 98.00 | 112.0 |
| radius_ratio | 846.0 | 168.874704 | 33.401356 | 104.0 | 141.00 | 167.0 | 195.00 | 333.0 |
| pr.axis_aspect_ratio | 846.0 | 61.677305 | 7.882188 | 47.0 | 57.00 | 61.0 | 65.00 | 138.0 |
| max.length_aspect_ratio | 846.0 | 8.567376 | 4.601217 | 2.0 | 7.00 | 8.0 | 10.00 | 55.0 |
| scatter_ratio | 846.0 | 168.887707 | 33.197710 | 112.0 | 147.00 | 157.0 | 198.00 | 265.0 |
| elongatedness | 846.0 | 40.936170 | 7.811882 | 26.0 | 33.00 | 43.0 | 46.00 | 61.0 |
| pr.axis_rectangularity | 846.0 | 20.580378 | 2.588558 | 17.0 | 19.00 | 20.0 | 23.00 | 29.0 |
| max.length_rectangularity | 846.0 | 147.998818 | 14.515652 | 118.0 | 137.00 | 146.0 | 159.00 | 188.0 |
| scaled_variance | 846.0 | 188.596927 | 31.360427 | 130.0 | 167.00 | 179.0 | 217.00 | 320.0 |
| scaled_variance.1 | 846.0 | 439.314421 | 176.496341 | 184.0 | 318.25 | 363.5 | 586.75 | 1018.0 |
| scaled_radius_of_gyration | 846.0 | 174.706856 | 32.546277 | 109.0 | 149.00 | 173.5 | 198.00 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | 72.443262 | 7.468734 | 59.0 | 67.00 | 71.5 | 75.00 | 135.0 |
| skewness_about | 846.0 | 6.361702 | 4.903244 | 0.0 | 2.00 | 6.0 | 9.00 | 22.0 |
| skewness_about.1 | 846.0 | 12.600473 | 8.930962 | 0.0 | 5.00 | 11.0 | 19.00 | 41.0 |
| skewness_about.2 | 846.0 | 188.918440 | 6.152247 | 176.0 | 184.00 | 188.0 | 193.00 | 206.0 |
| hollows_ratio | 846.0 | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.00 | 211.0 |
#plotting the distribution of the numerical variables
ve.hist(bins = 20, figsize = (20, 18), color = 'blue')
plt.show()
# Density plots
plt.figure(figsize=(20, 18))
col = 1
for i in ve.drop(columns='class').columns:
    plt.subplot(4, 5, col)
    sns.distplot(ve[i], color = 'b')
    col += 1
#checking the distribution of the class variable
print(ve['class'].value_counts())
plt.title('Count of Vehicle Class column')
sns.countplot(x = 'class', data = ve);
car    429
bus    218
van    199
Name: class, dtype: int64
# Boxplots for outliers
plt.figure(figsize=(25, 23))
col = 1
for i in ve.drop(columns='class').columns:
    plt.subplot(6, 4, col)
    sns.boxplot(ve[i], color='blue')
    col += 1
#Boxplots against the class with other numeric variables
fig,axs = plt.subplots(4,5,figsize=(20,18))
sns.boxplot(x="class", y="compactness",data=ve,ax=axs[0,0]);
sns.boxplot(x="class", y="circularity",data=ve,ax=axs[0,1]);
sns.boxplot(x="class", y="distance_circularity",data=ve,ax=axs[0,2]);
sns.boxplot(x="class", y="radius_ratio",data=ve,ax=axs[0,3]);
sns.boxplot(x="class", y="pr.axis_aspect_ratio",data=ve,ax=axs[0,4]);
sns.boxplot(x="class", y="max.length_aspect_ratio",data=ve,ax=axs[1,0]);
sns.boxplot(x="class", y="scatter_ratio",data=ve,ax=axs[1,1]);
sns.boxplot(x="class", y="elongatedness",data=ve,ax=axs[1,2]);
sns.boxplot(x="class", y="pr.axis_rectangularity",data=ve,ax=axs[1,3]);
sns.boxplot(x="class", y="max.length_rectangularity",data=ve,ax=axs[1,4]);
sns.boxplot(x="class", y="scaled_variance",data=ve,ax=axs[2,0]);
sns.boxplot(x="class", y="scaled_variance.1",data=ve,ax=axs[2,1]);
sns.boxplot(x="class", y="scaled_radius_of_gyration",data=ve,ax=axs[2,2]);
sns.boxplot(x="class", y="scaled_radius_of_gyration.1",data=ve,ax=axs[2,3]);
sns.boxplot(x="class", y="skewness_about",data=ve,ax=axs[2,4]);
sns.boxplot(x="class", y="skewness_about.1",data=ve,ax=axs[3,0]);
sns.boxplot(x="class", y="skewness_about.2",data=ve,ax=axs[3,1]);
sns.boxplot(x="class", y="hollows_ratio",data=ve,ax=axs[3,2]);
fig.tight_layout()
There is a significant difference between the classes in the means and medians of almost all the numeric attributes.
# Find the outliers and replace them with the median
for col_name in ve.drop(columns = 'class').columns:
    q1 = ve[col_name].quantile(0.25)
    q3 = ve[col_name].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    ve.loc[(ve[col_name] < low) | (ve[col_name] > high), col_name] = ve[col_name].median()
# Boxplots after outlier treatment
plt.figure(figsize=(25, 23))
col = 1
for i in ve.drop(columns='class').columns:
    plt.subplot(6, 4, col)
    sns.boxplot(ve[i], color='blue')
    col += 1
#checking for correlation
plt.figure(figsize=(20,18))
corr=ve.corr()
sns.heatmap(corr,annot=True);
# Separating the features from the class variable to check correlation with the target
X = ve.loc[:, ve.columns != 'class']
y = ve['class'].astype('category').cat.codes
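cat.codes assigns integer labels to the classes alphabetically, so it helps to keep the mapping on hand when reading the confusion matrices later; a small sketch:
# Sketch: record which integer code corresponds to which class
# (categories are ordered alphabetically, so the expected mapping is {0: 'bus', 1: 'car', 2: 'van'})
class_mapping = dict(enumerate(ve['class'].astype('category').cat.categories))
print(class_mapping)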
# Plotting the correlation with the target variable
plt.figure(figsize = (15, 8))
ax = sns.barplot(x = X.columns, y = X.corrwith(y))
plt.title('Correlation with Class column', fontsize = 20)
x = plt.setp(ax.get_xticklabels(), rotation=90)
PCA
# Scaling the numeric variables
XScaled=X.apply(zscore)
XScaled.head()
| | compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.160580 | 0.518073 | 0.057177 | 0.300945 | 1.933135 | 0.912212 | -0.207598 | 0.136262 | -0.224342 | 0.758332 | -0.400771 | -0.337407 | 0.285705 | -0.315806 | -0.032330 | 0.387162 | -0.312012 | 0.183957 |
| 1 | -0.325470 | -0.623732 | 0.120741 | -0.850666 | -0.740596 | 0.427456 | -0.599423 | 0.520519 | -0.610886 | -0.344578 | -0.594220 | -0.618623 | -0.513630 | 0.009122 | 0.624090 | 0.161740 | 0.013265 | 0.452977 |
| 2 | 1.254193 | 0.844303 | 1.519141 | 1.265808 | 0.863642 | 0.912212 | 1.148719 | -1.144597 | 0.935290 | 0.689401 | 1.114582 | 1.131806 | 1.392477 | 0.171586 | 1.718123 | -0.401818 | -0.149374 | 0.049447 |
| 3 | -0.082445 | -0.623732 | -0.006386 | -0.290423 | 0.328896 | 0.427456 | -0.750125 | 0.648605 | -0.610886 | -0.344578 | -0.916635 | -0.739145 | -1.466683 | -1.453054 | -0.032330 | -0.289106 | 1.639649 | 1.529056 |
| 4 | -1.054545 | -0.134387 | -0.769150 | 1.141310 | -0.027601 | -0.057300 | -0.599423 | 0.520519 | -0.610886 | -0.275646 | 1.694930 | -0.647319 | 0.408680 | -0.072110 | 0.624090 | -0.176395 | -1.450481 | -1.699181 |
# Plotting the cumulative variance explained by the principal components
pca = PCA()
X_pca_ = pca.fit_transform(XScaled)
plt.figure(figsize = (12, 8))
plt.plot((np.cumsum(pca.explained_variance_ratio_) * 100), marker = 'X')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance');
print(pca.explained_variance_)
[9.74940269e+00 3.35071912e+00 1.19238155e+00 1.13381916e+00 8.83997312e-01 6.66265745e-01 3.18150910e-01 2.28179142e-01 1.31018595e-01 7.98619108e-02 7.33979478e-02 6.46162669e-02 4.01448646e-02 3.22758478e-02 2.93936408e-02 2.27005257e-02 1.98136761e-02 5.16287320e-03]
# Plotting the cumulative explained variance as a step plot
plt.figure(figsize = (12, 8))
plt.step(list(range(18)), (np.cumsum(pca.explained_variance_ratio_) * 100), where = 'mid')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance')
plt.title('Vehicle Dataset Explained Variance');
The first six principal components explain more than 95% of the variation (the first five capture just over 91%). We can therefore drop the 7th component onwards.
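Instead of reading the component count off the plot, PCA can also be asked directly for the smallest number of components that reaches a variance threshold; a sketch assuming the 95% target mentioned above:
# Sketch: let PCA pick the number of components needed to reach ~95% explained variance
pca_95 = PCA(n_components=0.95)
X_pca_95 = pca_95.fit_transform(XScaled)
print('Components retained for 95% variance:', pca_95.n_components_)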
# Using 6 components and printing the eigenvectors
pca3 = PCA(n_components=6)
pca3.fit(XScaled)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(XScaled)
[[ 0.27250289  0.28725469  0.30242111  0.26971354  0.09786073  0.19520014  0.31052393 -0.3090069   0.307287    0.27815416  0.29976509  0.30553237  0.26323762 -0.04193594  0.03608321  0.05872048  0.03801314  0.08474   ]
 [-0.08704358  0.13162176 -0.04614301 -0.19793126 -0.25783995 -0.10804563  0.07528535 -0.01322994  0.0875602   0.12215424  0.07726575  0.07150302  0.21058205  0.50362158 -0.01576632 -0.09274624 -0.50162122 -0.50761211]
 [-0.03818521 -0.20114691  0.06346211  0.05628517 -0.06199275 -0.14895782  0.10904283 -0.09085269  0.1060705  -0.21368469  0.1445998   0.11034374 -0.20287019  0.07386402 -0.55917399  0.6706805  -0.06224071 -0.04170535]
 [ 0.13867501 -0.03805548  0.10895429 -0.25435509 -0.61276572  0.27867816  0.00539295  0.06521486  0.03089915  0.04146747 -0.06400509 -0.00219687 -0.08553965 -0.11539962  0.47370331  0.42842603 -0.0274096   0.09603749]
 [ 0.13710147 -0.13899555 -0.08001743  0.13374437  0.12360146 -0.63489336  0.08555745 -0.07907344  0.08164638 -0.25111294  0.14747123  0.11010098 -0.00521211  0.1380686   0.56655224  0.13086982  0.18051929 -0.11078807]
 [ 0.26361138 -0.07134741 -0.01690062 -0.13818366 -0.57782861 -0.289097    0.09774711 -0.07572829  0.10540323 -0.07819622  0.13291241  0.11539822 -0.0670574  -0.13151308 -0.31917609 -0.46840497  0.28013644  0.05944441]]
[0.54099325 0.18593103 0.06616512 0.0629155 0.04905291 0.03697101]
# Printing the original and reduced number of features
pca_6 = PCA(n_components = 6)
X_pca = pca_6.fit_transform(XScaled)
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_pca.shape[1])
Original number of features: 18 Reduced number of features: 6
#viewing the first 5 observations of the pca components
pca_df = pd.DataFrame(data = X_pca)
pca_df.head()
| | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | 0.584228 | -0.675673 | -0.453334 | -0.750656 | -0.777515 | -1.848809 |
| 1 | -1.512180 | -0.348934 | -0.333436 | 1.268953 | -0.324929 | -0.118317 |
| 2 | 3.913448 | 0.234507 | -1.265094 | 0.137224 | 0.915751 | -0.685594 |
| 3 | -1.535193 | -3.044413 | -0.469623 | 0.324317 | -0.611590 | 0.367777 |
| 4 | -0.642062 | 1.488882 | -0.246288 | -0.550939 | 0.471655 | -1.012698 |
sns.pairplot(pca_df, diag_kind = 'kde');
SVM
#splitting the original data into train and test 70:30
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size = 0.3, random_state = 10)
rtr, ctr = X_train.shape
print('The training set comprises of', rtr, 'rows and', ctr, 'columns.')
The training set comprises of 592 rows and 18 columns.
rt, ct = X_test.shape
print('The test set comprises of', rt, 'rows and', ct, 'columns.')
The test set comprises of 254 rows and 18 columns.
#splitting the pca data into train and test 70:30
X_tr, X_te, y_tr, y_te = train_test_split(X_pca, y, test_size = 0.3, random_state = 10)
rtr_pca, ctr_pca = X_tr.shape
print('The PCA training set comprises of', rtr_pca, 'rows and', ctr_pca, 'columns.')
The PCA training set comprises of 592 rows and 6 columns.
rt_pca, ct_pca = X_te.shape
print('The PCA test set comprises of', rt_pca, 'rows and', ct_pca, 'columns.')
The PCA test set comprises of 254 rows and 6 columns.
# Building a Support Vector Machine on train data
svc_model = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model.fit(X_train, y_train)
SVC(C=4)
#predicting on train data
sv_train_predict = svc_model.predict(X_train)
print("Model Accuracy on train: {0:.4f}".format(metrics.accuracy_score(y_train, sv_train_predict)))
print()
Model Accuracy on train: 0.9899
#predicting on test data
sv_test_predict = svc_model.predict(X_test)
print("Model Accuracy on test: {0:.4f}".format(metrics.accuracy_score(y_test, sv_test_predict)))
print()
Model Accuracy on test: 0.9685
#visualization of confusion matrix in the form of a heatmap
cm= confusion_matrix(y_test, sv_test_predict)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Predicted Classes', fontsize = 15)
plt.ylabel('Actual Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_test, sv_test_predict, labels=[0,1,2]))
Classification Report
precision recall f1-score support
0 1.00 0.99 0.99 71
1 0.98 0.96 0.97 125
2 0.90 0.97 0.93 58
accuracy 0.97 254
macro avg 0.96 0.97 0.97 254
weighted avg 0.97 0.97 0.97 254
precision_SV, recall_SV, f1_score_SV, support = precision_recall_fscore_support(y_test, sv_test_predict,average='macro')
print('Precision Score :', '%0.2f' % precision_SV)
print('Recall Score :', '%0.2f' % recall_SV)
print('F1-Score:', '%0.2f' % f1_score_SV)
SV_Acc= accuracy_score(y_test, sv_test_predict)
print('Accuracy Score :','%0.2f' % SV_Acc)
Precision Score : 0.96
Recall Score : 0.97
F1-Score: 0.97
Accuracy Score : 0.97
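The C = 4 / RBF settings used above were chosen by hand; a hedged sketch of how a small cross-validated grid search could tune them instead (the parameter grid values here are illustrative, not from the original notebook):
# Sketch: tune C and gamma with a cross-validated grid search
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.5, 1, 2, 4, 8], 'gamma': ['scale', 0.01, 0.1]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print('Best parameters:', grid.best_params_)
print('Best cross-validated accuracy: {0:.4f}'.format(grid.best_score_))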
#SVM on the pca data
svc_model_pca = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model_pca.fit(X_tr, y_tr)
SVC(C=4)
#predicting on train data
sv_tr_predict = svc_model_pca.predict(X_tr)
print("Model Accuracy on train: {0:.4f}".format(metrics.accuracy_score(y_tr, sv_tr_predict)))
print()
Model Accuracy on train: 0.9476
#predicting on test data
sv_te_predict = svc_model_pca.predict(X_te)
print("Model Accuracy on test: {0:.4f}".format(metrics.accuracy_score(y_te, sv_te_predict)))
print()
Model Accuracy on test: 0.9213
#visualization of confusion matrix in the form of a heatmap
cm= confusion_matrix(y_te, sv_te_predict)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Predicted Classes', fontsize = 15)
plt.ylabel('Actual Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_te, sv_te_predict, labels=[0,1,2]))
Classification Report
precision recall f1-score support
0 0.96 0.94 0.95 71
1 0.94 0.91 0.93 125
2 0.84 0.91 0.88 58
accuracy 0.92 254
macro avg 0.91 0.92 0.92 254
weighted avg 0.92 0.92 0.92 254
precision_SV_pca, recall_SV_pca, f1_score_SV_pca, support_pca = precision_recall_fscore_support(y_te, sv_te_predict,average='macro')
print('Precision Score :', '%0.2f' % precision_SV_pca)
print('Recall Score :', '%0.2f' % recall_SV_pca)
print('F1-Score:', '%0.2f' % f1_score_SV_pca)
SV_Acc_pca= accuracy_score(y_te, sv_te_predict)
print('Accuracy Score :','%0.2f' % SV_Acc_pca)
Precision Score : 0.91
Recall Score : 0.92
F1-Score: 0.92
Accuracy Score : 0.92
modellists = []
modellists.append(['Support Vector Classifier without PCA', SV_Acc * 100, recall_SV * 100, precision_SV * 100,f1_score_SV*100])
modellists.append(['Support Vector Classifier with PCA', SV_Acc_pca* 100, recall_SV_pca * 100, precision_SV_pca * 100,f1_score_SV_pca*100])
mdl_df = pd.DataFrame(modellists, columns = ['Model','Accuracy Score of Test Data', 'Recall Score', 'Precision Score','F1 Score'])
mdl_df
| | Model | Accuracy Score of Test Data | Recall Score | Precision Score | F1 Score |
|---|---|---|---|---|---|
| 0 | Support Vector Classifier without PCA | 96.850394 | 97.047758 | 96.227745 | 96.596702 |
| 1 | Support Vector Classifier with PCA | 92.125984 | 92.315169 | 91.352049 | 91.773898 |
Both models give more than 90% accuracy on the test data. The PCA model used only 6 components to reach 90%+ accuracy, whereas the model without PCA used all 18 variables to reach a similar accuracy. The benefit of PCA would stand out more on a dataset that truly suffers from the curse of dimensionality; with only 18 variables in the original data, the difference is subtle.
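A single train/test split can also make small accuracy differences look larger or smaller than they really are; a sketch of a cross-validated comparison of the two feature sets (5-fold CV is an assumption, not part of the original analysis):
# Sketch: 5-fold cross-validated accuracy on the full data, with and without PCA
from sklearn.model_selection import cross_val_score
cv_full = cross_val_score(SVC(C=4, kernel='rbf', gamma='scale'), XScaled, y, cv=5)
cv_pca = cross_val_score(SVC(C=4, kernel='rbf', gamma='scale'), X_pca, y, cv=5)
print('CV accuracy without PCA: {0:.4f}'.format(cv_full.mean()))
print('CV accuracy with PCA   : {0:.4f}'.format(cv_pca.mean()))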